library(tidyverse)
library(janitor)
library(knitr)
library(here)
library(patchwork) # For combining plots
# Optional packages
has_naniar <- suppressWarnings(require(naniar, quietly = TRUE))
has_skimr <- suppressWarnings(require(skimr, quietly = TRUE))
has_gt <- suppressWarnings(require(gt, quietly = TRUE))
# File paths using here() for robust path resolution
primary_file <- here("data", "grf_shooting_clean.csv")
fallback_file <- here("data", "GRF Shooting SM[40].csv")
# Load data with fallback
if (file.exists(primary_file)) {
df <- read_csv(primary_file, show_col_types = FALSE)
cat("Loaded primary file:", primary_file, "\n")
} else if (file.exists(fallback_file)) {
df <- read_csv(fallback_file, show_col_types = FALSE) %>%
janitor::clean_names()
cat("Loaded fallback file:", fallback_file, "\n")
cat("Applied janitor::clean_names()\n")
} else {
stop("ERROR: Neither data file found!\n",
"Expected: ", primary_file, " or ", fallback_file)
}
## Loaded primary file: /Users/samuelmontalvo/Documents/GitHub/basketball_shooting_project/data/grf_shooting_clean.csv
cat("\nDimensions:", nrow(df), "rows x", ncol(df), "columns\n")
##
## Dimensions: 460 rows x 18 columns
cat("Column names:\n")
## Column names:
cat(paste(names(df), collapse = "\n"))
## player_id
## shot_number
## shot_type
## made
## position
## height
## body_mass
## peak_braking_force
## peak_braking_power
## peak_propulsive_force
## peak_propulsive_power
## left_leg_peak_propulsive_force
## right_leg_peak_propulsive_force
## jump_height
## braking_duration
## propulsive_duration
## displacement_depth
## force_at_min_displacement
# Helper function to find columns by keyword
find_col <- function(cols, patterns) {
for (p in patterns) {
matches <- cols[str_detect(tolower(cols), tolower(p))]
if (length(matches) > 0) return(matches[1])
}
return(NA_character_)
}
# Find player_id
player_id_col <- find_col(names(df), c("^player_id$", "^player$", "subject", "athlete", "^pid$", "^id$"))
if (is.na(player_id_col)) {
stop("ERROR: Could not identify player_id column.\n",
"Expected column containing: player, pid, athlete, subject, or id")
}
# Rename to player_id if needed
if (player_id_col != "player_id") {
df <- df %>% rename(player_id = !!sym(player_id_col))
}
df <- df %>% mutate(player_id = as.character(player_id))
# Find shot_type
shot_type_col <- find_col(names(df), c("^shot_type$", "shot.*type", "type.*2.*3"))
has_shot_type <- !is.na(shot_type_col)
if (has_shot_type && shot_type_col != "shot_type") {
df <- df %>% rename(shot_type = !!sym(shot_type_col))
}
if (has_shot_type) {
df <- df %>%
mutate(shot_type = case_when(
shot_type %in% c(2, "2", "2PT", "2pt") ~ "2PT",
shot_type %in% c(3, "3", "3PT", "3pt") ~ "3PT",
TRUE ~ as.character(shot_type)
))
}
# Find made/result
made_col <- find_col(names(df), c("^made$", "miss.*made", "result", "outcome", "make"))
has_made <- !is.na(made_col)
if (has_made && made_col != "made") {
df <- df %>% rename(made = !!sym(made_col))
}
if (has_made) {
df <- df %>% mutate(made = as.integer(made))
}
# Report findings
cat("Column Mapping:\n")
## Column Mapping:
cat("─────────────────────────────────────\n")
## ─────────────────────────────────────
cat("player_id:", player_id_col, "→ player_id\n")
## player_id: player_id → player_id
cat("shot_type:", ifelse(has_shot_type, shot_type_col, "NOT FOUND"), "\n")
## shot_type: shot_type
cat("made: ", ifelse(has_made, made_col, "NOT FOUND"), "\n")
## made: made
missingness_by_col <- df %>%
summarise(across(everything(), ~ sum(is.na(.x)))) %>%
pivot_longer(everything(), names_to = "column", values_to = "n_missing") %>%
mutate(
n_total = nrow(df),
pct_missing = round(100 * n_missing / n_total, 2)
) %>%
arrange(desc(pct_missing))
kable(missingness_by_col, caption = "Missingness by Column")
| column | n_missing | n_total | pct_missing |
|---|---|---|---|
| player_id | 0 | 460 | 0 |
| shot_number | 0 | 460 | 0 |
| shot_type | 0 | 460 | 0 |
| made | 0 | 460 | 0 |
| position | 0 | 460 | 0 |
| height | 0 | 460 | 0 |
| body_mass | 0 | 460 | 0 |
| peak_braking_force | 0 | 460 | 0 |
| peak_braking_power | 0 | 460 | 0 |
| peak_propulsive_force | 0 | 460 | 0 |
| peak_propulsive_power | 0 | 460 | 0 |
| left_leg_peak_propulsive_force | 0 | 460 | 0 |
| right_leg_peak_propulsive_force | 0 | 460 | 0 |
| jump_height | 0 | 460 | 0 |
| braking_duration | 0 | 460 | 0 |
| propulsive_duration | 0 | 460 | 0 |
| displacement_depth | 0 | 460 | 0 |
| force_at_min_displacement | 0 | 460 | 0 |
p_missing <- missingness_by_col %>%
filter(pct_missing > 0 | row_number() <= 10) %>%
ggplot(aes(x = reorder(column, pct_missing), y = pct_missing)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(
title = "Missing Data by Column",
x = NULL,
y = "% Missing"
) +
theme_minimal()
print(p_missing)
n_duplicates <- sum(duplicated(df))
cat("Number of fully duplicated rows:", n_duplicates, "\n")
## Number of fully duplicated rows: 0
if ("shot_number" %in% names(df) && has_shot_type) {
dup_trials <- df %>%
group_by(player_id, shot_type, shot_number) %>%
filter(n() > 1) %>%
nrow()
cat("Duplicate (player_id, shot_type, shot_number) combinations:", dup_trials, "\n")
}
## Duplicate (player_id, shot_type, shot_number) combinations: 0
trials_per_player <- df %>%
count(player_id, name = "n_trials") %>%
arrange(player_id)
kable(trials_per_player, caption = "Trials per Player")
| player_id | n_trials |
|---|---|
| 1 | 20 |
| 10 | 20 |
| 11 | 20 |
| 12 | 20 |
| 13 | 20 |
| 14 | 20 |
| 15 | 20 |
| 16 | 20 |
| 17 | 20 |
| 18 | 20 |
| 19 | 20 |
| 2 | 20 |
| 20 | 20 |
| 21 | 20 |
| 22 | 20 |
| 23 | 20 |
| 3 | 20 |
| 4 | 20 |
| 5 | 20 |
| 6 | 20 |
| 7 | 20 |
| 8 | 20 |
| 9 | 20 |
trials_by_type <- df %>%
count(player_id, shot_type) %>%
pivot_wider(names_from = shot_type, values_from = n, values_fill = 0) %>%
mutate(
total = `2PT` + `3PT`,
balanced = ifelse(`2PT` == 10 & `3PT` == 10, "✓", "✗")
)
kable(trials_by_type, caption = "Trials by Shot Type (Expected: 10/10)")
| player_id | 2PT | 3PT | total | balanced |
|---|---|---|---|---|
| 1 | 10 | 10 | 20 | ✓ |
| 10 | 10 | 10 | 20 | ✓ |
| 11 | 10 | 10 | 20 | ✓ |
| 12 | 10 | 10 | 20 | ✓ |
| 13 | 10 | 10 | 20 | ✓ |
| 14 | 10 | 10 | 20 | ✓ |
| 15 | 10 | 10 | 20 | ✓ |
| 16 | 10 | 10 | 20 | ✓ |
| 17 | 10 | 10 | 20 | ✓ |
| 18 | 10 | 10 | 20 | ✓ |
| 19 | 10 | 10 | 20 | ✓ |
| 2 | 10 | 10 | 20 | ✓ |
| 20 | 10 | 10 | 20 | ✓ |
| 21 | 10 | 10 | 20 | ✓ |
| 22 | 10 | 10 | 20 | ✓ |
| 23 | 10 | 10 | 20 | ✓ |
| 3 | 10 | 10 | 20 | ✓ |
| 4 | 10 | 10 | 20 | ✓ |
| 5 | 10 | 10 | 20 | ✓ |
| 6 | 10 | 10 | 20 | ✓ |
| 7 | 10 | 10 | 20 | ✓ |
| 8 | 10 | 10 | 20 | ✓ |
| 9 | 10 | 10 | 20 | ✓ |
complete_players <- sum(trials_by_type$balanced == "✓")
cat("\nPlayers with complete data (10 2PT + 10 3PT):", complete_players, "of", nrow(trials_by_type), "\n")
##
## Players with complete data (10 2PT + 10 3PT): 23 of 23
# Identify numeric columns
exclude_cols <- c("player_id", "shot_type", "made", "position")
numeric_cols <- df %>%
select(-any_of(exclude_cols)) %>%
select(where(is.numeric)) %>%
names()
cat("Identified", length(numeric_cols), "numeric columns\n")
## Identified 14 numeric columns
# Create summary table
numeric_summary <- df %>%
select(all_of(numeric_cols)) %>%
summarise(across(everything(), list(
mean = ~ mean(.x, na.rm = TRUE),
sd = ~ sd(.x, na.rm = TRUE),
median = ~ median(.x, na.rm = TRUE),
iqr = ~ IQR(.x, na.rm = TRUE),
min = ~ min(.x, na.rm = TRUE),
max = ~ max(.x, na.rm = TRUE),
p1 = ~ quantile(.x, 0.01, na.rm = TRUE),
p99 = ~ quantile(.x, 0.99, na.rm = TRUE),
pct_na = ~ round(100 * sum(is.na(.x)) / length(.x), 2)
))) %>%
pivot_longer(
everything(),
names_to = c("variable", "stat"),
names_pattern = "^(.*)_(mean|sd|median|iqr|min|max|p1|p99|pct_na)$"
) %>%
pivot_wider(names_from = stat, values_from = value) %>%
mutate(across(where(is.numeric), ~ round(.x, 2))) %>%
arrange(variable)
kable(numeric_summary, caption = "Numeric Variable Summary Statistics")
| variable | mean | sd | median | iqr | min | max | p1 | p99 | pct_na |
|---|---|---|---|---|---|---|---|---|---|
| body_mass | 91.29 | 12.77 | 90.68 | 12.07 | 68.88 | 123.97 | 68.88 | 123.97 | 0 |
| braking_duration | 0.42 | 0.47 | 0.29 | 0.12 | 0.08 | 2.81 | 0.11 | 2.69 | 0 |
| displacement_depth | 21.95 | 19.42 | 17.70 | 8.05 | 0.00 | 184.80 | 0.79 | 100.73 | 0 |
| force_at_min_displacement | 1500.07 | 381.29 | 1518.00 | 396.50 | 7.90 | 2643.00 | 11.20 | 2333.87 | 0 |
| height | 195.35 | 6.49 | 195.00 | 10.00 | 180.00 | 206.00 | 180.00 | 206.00 | 0 |
| jump_height | 9.69 | 5.05 | 8.90 | 7.40 | 1.30 | 27.10 | 1.60 | 23.08 | 0 |
| left_leg_peak_propulsive_force | 1048.32 | 201.98 | 1018.00 | 302.00 | 609.00 | 1567.00 | 653.95 | 1513.97 | 0 |
| peak_braking_force | 1532.33 | 308.83 | 1520.50 | 378.75 | 827.00 | 2643.00 | 979.16 | 2333.87 | 0 |
| peak_braking_power | 601.14 | 292.79 | 594.50 | 379.25 | -0.14 | 2131.00 | 59.36 | 1540.15 | 0 |
| peak_propulsive_force | 1959.21 | 343.43 | 1911.50 | 529.75 | 1099.00 | 2914.00 | 1194.96 | 2827.27 | 0 |
| peak_propulsive_power | 2753.89 | 734.59 | 2711.00 | 975.25 | 544.00 | 7311.00 | 1326.22 | 4580.10 | 0 |
| propulsive_duration | 0.31 | 0.25 | 0.29 | 0.08 | 0.14 | 2.71 | 0.18 | 1.47 | 0 |
| right_leg_peak_propulsive_force | 940.41 | 167.93 | 926.50 | 207.25 | 568.00 | 1513.00 | 603.85 | 1421.17 | 0 |
| shot_number | 5.50 | 2.88 | 5.50 | 5.00 | 1.00 | 10.00 | 1.00 | 10.00 | 0 |
global_outliers <- tibble(
column = character(),
n_outliers = integer(),
pct_outliers = numeric()
)
for (col in numeric_cols) {
x <- df[[col]]
q1 <- quantile(x, 0.25, na.rm = TRUE)
q3 <- quantile(x, 0.75, na.rm = TRUE)
iqr <- q3 - q1
lower <- q1 - 1.5 * iqr
upper <- q3 + 1.5 * iqr
n_out <- sum(x < lower | x > upper, na.rm = TRUE)
pct_out <- round(100 * n_out / sum(!is.na(x)), 2)
global_outliers <- global_outliers %>%
add_row(column = col, n_outliers = n_out, pct_outliers = pct_out)
}
within_player_outliers <- tibble(
column = character(),
n_outliers = integer()
)
# Players with >= 5 trials
valid_players <- df %>%
count(player_id) %>%
filter(n >= 5) %>%
pull(player_id)
df_valid <- df %>% filter(player_id %in% valid_players)
for (col in numeric_cols) {
z_scores <- df_valid %>%
group_by(player_id) %>%
mutate(
col_mean = mean(.data[[col]], na.rm = TRUE),
col_sd = sd(.data[[col]], na.rm = TRUE),
z = ifelse(col_sd > 0, (.data[[col]] - col_mean) / col_sd, 0)
) %>%
ungroup() %>%
pull(z)
n_out <- sum(abs(z_scores) > 3, na.rm = TRUE)
within_player_outliers <- within_player_outliers %>%
add_row(column = col, n_outliers = n_out)
}
outlier_summary <- global_outliers %>%
rename(global_outliers = n_outliers, global_pct = pct_outliers) %>%
left_join(
within_player_outliers %>% rename(within_player_outliers = n_outliers),
by = "column"
) %>%
arrange(desc(global_outliers + within_player_outliers))
kable(outlier_summary, caption = "Outlier Summary by Column")
| column | global_outliers | global_pct | within_player_outliers |
|---|---|---|---|
| displacement_depth | 59 | 12.83 | 8 |
| braking_duration | 54 | 11.74 | 9 |
| body_mass | 40 | 8.70 | 0 |
| force_at_min_displacement | 18 | 3.91 | 3 |
| propulsive_duration | 11 | 2.39 | 6 |
| peak_braking_force | 10 | 2.17 | 4 |
| peak_braking_power | 7 | 1.52 | 4 |
| right_leg_peak_propulsive_force | 10 | 2.17 | 0 |
| peak_propulsive_power | 4 | 0.87 | 3 |
| jump_height | 3 | 0.65 | 0 |
| shot_number | 0 | 0.00 | 0 |
| height | 0 | 0.00 | 0 |
| peak_propulsive_force | 0 | 0.00 | 0 |
| left_leg_peak_propulsive_force | 0 | 0.00 | 0 |
key_keywords <- c("force", "power", "jump", "brak", "propul", "duration", "depth")
# Get keyword matches
keyword_matches <- numeric_cols[str_detect(tolower(numeric_cols),
paste(key_keywords, collapse = "|"))]
# Get top 10 by variance
var_ranking <- df %>%
select(all_of(numeric_cols)) %>%
summarise(across(everything(), ~ var(.x, na.rm = TRUE))) %>%
pivot_longer(everything(), names_to = "column", values_to = "variance") %>%
arrange(desc(variance))
top_by_variance <- var_ranking %>% slice_head(n = 10) %>% pull(column)
# Combine
key_vars <- unique(c(keyword_matches, top_by_variance))
key_vars <- key_vars[1:min(12, length(key_vars))]
cat("Key variables selected for visualization:\n")
## Key variables selected for visualization:
cat(paste(key_vars, collapse = "\n"))
## peak_braking_force
## peak_braking_power
## peak_propulsive_force
## peak_propulsive_power
## left_leg_peak_propulsive_force
## right_leg_peak_propulsive_force
## jump_height
## braking_duration
## propulsive_duration
## displacement_depth
## force_at_min_displacement
## body_mass
for (var in key_vars) {
p <- df %>%
ggplot(aes(x = .data[[var]])) +
geom_histogram(aes(y = after_stat(density)), bins = 30,
fill = "steelblue", alpha = 0.7) +
geom_density(color = "darkblue", linewidth = 1) +
labs(title = paste("Distribution:", var), x = var, y = "Density") +
theme_minimal()
print(p)
}
# =============================================================================
# FIGURE 1: Combined Publication-Ready Figure
# Panel A: Make Rate by Shot Type
# Panel B: Force Plate Variables by Shot Type
# =============================================================================
# --- Panel A: Make Rate by Shot Type ---
make_by_type <- df %>%
group_by(shot_type) %>%
summarise(
n = n(),
makes = sum(made, na.rm = TRUE),
make_rate = mean(made, na.rm = TRUE) * 100,
se = sqrt(make_rate * (100 - make_rate) / n),
.groups = "drop"
)
overall_make_rate <- mean(df$made, na.rm = TRUE) * 100
p_make_rate <- make_by_type %>%
ggplot(aes(x = shot_type, y = make_rate, fill = shot_type)) +
geom_col(alpha = 0.85, width = 0.6) +
geom_errorbar(aes(ymin = make_rate - 1.96 * se, ymax = make_rate + 1.96 * se),
width = 0.15, linewidth = 0.8) +
geom_hline(yintercept = overall_make_rate, linetype = "dashed",
color = "gray40", linewidth = 0.7) +
geom_label(aes(label = paste0(round(make_rate, 1), "%"), y = make_rate + 1.96 * se + 5),
size = 5, fontface = "bold", fill = "white", label.size = 0) +
scale_fill_manual(values = c("2PT" = "#1f77b4", "3PT" = "#ff7f0e")) +
scale_y_continuous(limits = c(0, 95), breaks = seq(0, 100, 20),
expand = expansion(mult = c(0, 0.08))) +
labs(
x = "Shot Type",
y = "Make Rate (%)"
) +
theme_minimal(base_size = 12) +
theme(
legend.position = "none",
axis.title = element_text(face = "bold"),
axis.text = element_text(size = 11),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank()
)
# --- Panel B: Force Plate Variables by Shot Type ---
df_long <- df %>%
select(player_id, shot_type, all_of(key_vars)) %>%
pivot_longer(
cols = all_of(key_vars),
names_to = "variable",
values_to = "value"
) %>%
mutate(
# Clean up variable names and add units
variable_label = case_when(
str_detect(variable, "jump_height") ~ "Jump Height (cm)",
str_detect(variable, "peak_propulsive_force") ~ "Peak Propulsive Force (N)",
str_detect(variable, "peak_propulsive_power") ~ "Peak Propulsive Power (W)",
str_detect(variable, "peak_braking_force") ~ "Peak Braking Force (N)",
str_detect(variable, "peak_braking_power") ~ "Peak Braking Power (W)",
str_detect(variable, "body_mass") ~ "Body Mass (kg)",
str_detect(variable, "braking_duration") ~ "Braking Duration (s)",
str_detect(variable, "propulsive_duration") ~ "Propulsive Duration (s)",
str_detect(variable, "displacement_depth") ~ "Displacement Depth (cm)",
str_detect(variable, "force_at_min_displacement") ~ "Force at Min Displacement (N)",
str_detect(variable, "left_leg.*force") ~ "Left Leg Peak Propulsive Force (N)",
str_detect(variable, "right_leg.*force") ~ "Right Leg Peak Propulsive Force (N)",
TRUE ~ str_replace_all(variable, "_", " ") %>% str_to_title()
)
)
p_violins <- df_long %>%
ggplot(aes(x = shot_type, y = value, fill = shot_type)) +
geom_violin(alpha = 0.5, trim = FALSE, linewidth = 0.3) +
geom_boxplot(width = 0.15, alpha = 0.9, outlier.shape = NA, linewidth = 0.4) +
scale_fill_manual(values = c("2PT" = "#1f77b4", "3PT" = "#ff7f0e"),
labels = c("2PT" = "2-Point", "3PT" = "3-Point")) +
facet_wrap(~ variable_label, scales = "free_y", ncol = 3) +
labs(
x = "Shot Type",
y = "Value",
fill = "Shot Type"
) +
theme_minimal(base_size = 11) +
theme(
legend.position = "bottom",
legend.title = element_text(face = "bold"),
strip.text = element_text(face = "bold", size = 10),
strip.background = element_rect(fill = "gray95", color = NA),
panel.spacing = unit(0.8, "lines"),
axis.text.x = element_text(size = 9),
axis.title = element_text(face = "bold"),
panel.grid.minor = element_blank()
)
# --- Combine Panels with patchwork ---
fig1_combined <- p_make_rate / p_violins +
plot_layout(heights = c(1, 3)) +
plot_annotation(
tag_levels = "A",
theme = theme(
plot.tag = element_text(face = "bold", size = 16)
)
)
print(fig1_combined)
# --- Save publication-ready figure ---
fig_dir <- here("figures")
if (!dir.exists(fig_dir)) dir.create(fig_dir, recursive = TRUE)
ggsave(
filename = here("figures", "figure1_shot_performance_biomechanics.png"),
plot = fig1_combined,
width = 12,
height = 14,
dpi = 300,
bg = "white"
)
ggsave(
filename = here("figures", "figure1_shot_performance_biomechanics.pdf"),
plot = fig1_combined,
width = 12,
height = 14,
bg = "white"
)
cat("\nFigure 1 saved to figures/figure1_shot_performance_biomechanics.png and .pdf\n")
##
## Figure 1 saved to figures/figure1_shot_performance_biomechanics.png and .pdf
# Individual view of violin plots for reference
print(p_violins +
labs(title = "Force Plate Variables by Shot Type",
subtitle = "Violin plots with embedded boxplots comparing 2PT vs 3PT shots"))
jump_col <- find_col(numeric_cols, c("jump_height", "jump", "height"))
has_jump <- !is.na(jump_col)
if (has_jump) {
cat("Jump height variable identified:", jump_col, "\n")
} else {
cat("No jump height variable found\n")
}
## Jump height variable identified: jump_height
player_means <- df %>%
group_by(player_id, shot_type) %>%
summarise(mean_value = mean(.data[[jump_col]], na.rm = TRUE), .groups = "drop")
player_wide <- player_means %>%
pivot_wider(names_from = shot_type, values_from = mean_value)
p_paired <- player_wide %>%
ggplot(aes(x = `2PT`, y = `3PT`)) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray50") +
geom_point(size = 4, alpha = 0.7, color = "steelblue") +
labs(
title = paste("Player Mean", jump_col, ": 2PT vs 3PT"),
subtitle = "Points above dashed line = higher 3PT than 2PT",
x = "Mean 2PT Jump Height",
y = "Mean 3PT Jump Height"
) +
theme_minimal() +
coord_equal()
print(p_paired)
p_spaghetti <- player_means %>%
ggplot(aes(x = shot_type, y = mean_value, group = player_id, color = player_id)) +
geom_line(alpha = 0.7, linewidth = 1) +
geom_point(size = 3) +
labs(
title = paste("Player-Level Mean", jump_col, "by Shot Type"),
x = "Shot Type",
y = paste("Mean", jump_col),
color = "Player"
) +
theme_minimal()
print(p_spaghetti)
cor_matrix <- df %>%
select(all_of(numeric_cols)) %>%
cor(use = "complete.obs")
# Convert to long format
cor_long <- cor_matrix %>%
as.data.frame() %>%
rownames_to_column("var1") %>%
pivot_longer(-var1, names_to = "var2", values_to = "correlation")
p_cor <- cor_long %>%
ggplot(aes(x = var1, y = var2, fill = correlation)) +
geom_tile() +
scale_fill_gradient2(
low = "#2166ac", mid = "white", high = "#b2182b",
midpoint = 0, limits = c(-1, 1)
) +
labs(title = "Correlation Matrix", x = "", y = "") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
axis.text.y = element_text(size = 7)
)
print(p_cor)
strong_cors <- cor_long %>%
filter(var1 < var2) %>%
filter(abs(correlation) > 0.7) %>%
arrange(desc(abs(correlation))) %>%
mutate(correlation = round(correlation, 3))
kable(strong_cors %>% head(20), caption = "Strongest Correlations (|r| > 0.7)")
| var1 | var2 | correlation |
|---|---|---|
| left_leg_peak_propulsive_force | peak_propulsive_force | 0.923 |
| force_at_min_displacement | peak_braking_force | 0.877 |
| peak_propulsive_force | right_leg_peak_propulsive_force | 0.876 |
| peak_braking_force | peak_propulsive_force | 0.711 |
overall_make_rate <- mean(df$made, na.rm = TRUE)
cat("Overall make rate:", round(overall_make_rate * 100, 1), "%\n")
## Overall make rate: 64.1 %
See Figure 1 (Panel A) above for publication-ready visualization.
# Summary table
make_by_type_table <- df %>%
group_by(shot_type) %>%
summarise(
N = n(),
Makes = sum(made, na.rm = TRUE),
Misses = N - Makes,
`Make Rate (%)` = round(mean(made, na.rm = TRUE) * 100, 1),
.groups = "drop"
)
kable(make_by_type_table, caption = "Make Rate by Shot Type")
| shot_type | N | Makes | Misses | Make Rate (%) |
|---|---|---|---|---|
| 2PT | 230 | 167 | 63 | 72.6 |
| 3PT | 230 | 128 | 102 | 55.7 |
make_by_player <- df %>%
group_by(player_id) %>%
summarise(
n = n(),
makes = sum(made, na.rm = TRUE),
make_rate = round(mean(made, na.rm = TRUE) * 100, 1),
.groups = "drop"
) %>%
arrange(desc(make_rate))
kable(make_by_player, caption = "Make Rate by Player (Ranked)")
| player_id | n | makes | make_rate |
|---|---|---|---|
| 16 | 20 | 16 | 80 |
| 17 | 20 | 16 | 80 |
| 9 | 20 | 16 | 80 |
| 13 | 20 | 15 | 75 |
| 21 | 20 | 15 | 75 |
| 3 | 20 | 15 | 75 |
| 2 | 20 | 14 | 70 |
| 22 | 20 | 14 | 70 |
| 1 | 20 | 13 | 65 |
| 15 | 20 | 13 | 65 |
| 19 | 20 | 13 | 65 |
| 6 | 20 | 13 | 65 |
| 11 | 20 | 12 | 60 |
| 12 | 20 | 12 | 60 |
| 14 | 20 | 12 | 60 |
| 20 | 20 | 12 | 60 |
| 5 | 20 | 12 | 60 |
| 7 | 20 | 12 | 60 |
| 18 | 20 | 11 | 55 |
| 4 | 20 | 11 | 55 |
| 10 | 20 | 10 | 50 |
| 23 | 20 | 9 | 45 |
| 8 | 20 | 9 | 45 |
p_make_player <- make_by_player %>%
ggplot(aes(x = reorder(player_id, make_rate), y = make_rate)) +
geom_col(fill = "steelblue", alpha = 0.8) +
geom_hline(yintercept = overall_make_rate * 100, linetype = "dashed", color = "red") +
coord_flip() +
labs(title = "Make Rate by Player",
subtitle = paste("Red line = overall (", round(overall_make_rate * 100, 1), "%)"),
x = "Player ID", y = "Make Rate (%)") +
theme_minimal()
print(p_make_player)
p_jump_make <- df %>%
ggplot(aes(x = .data[[jump_col]], y = made)) +
geom_jitter(alpha = 0.4, height = 0.1, width = 0, color = "steelblue") +
geom_smooth(method = "loess", se = TRUE, color = "red", fill = "red", alpha = 0.2) +
labs(
title = paste("Make Probability vs", jump_col, "(Descriptive Only)"),
subtitle = "Red line = LOESS smooth",
x = jump_col,
y = "Made (0/1)"
) +
theme_minimal()
print(p_jump_make)
cat("## Summary\n\n")
# Completeness
cat("### Data Completeness\n\n")
high_missing <- missingness_by_col %>% filter(pct_missing > 5)
if (nrow(high_missing) > 0) {
cat("- **Columns with >5% missing:**", paste(high_missing$column, collapse = ", "), "\n")
} else {
cat("- ✓ No columns with >5% missing data\n")
}
incomplete <- trials_per_player %>% filter(n_trials != 20)
if (nrow(incomplete) > 0) {
cat("- **Players with incomplete trials:**", paste(incomplete$player_id, collapse = ", "), "\n")
} else {
cat("- ✓ All players have expected 20 trials\n")
}
# Artifact detection
cat("\n### Potential Artifact Variables\n\n")
artifact_flags <- numeric_summary %>%
mutate(p99_p1_ratio = ifelse(p1 != 0 & !is.na(p1), p99 / p1, NA)) %>%
filter(p99_p1_ratio > 100 | min < 0 | is.na(p99_p1_ratio))
if (nrow(artifact_flags) > 0) {
cat("- **Flagged variables:**", paste(artifact_flags$variable, collapse = ", "), "\n")
cat(" - (Extreme p99/p1 ratios or negative minimums)\n")
} else {
cat("- ✓ No obvious artifact variables detected\n")
}
# Stability
cat("\n### Variable Stability\n\n")
cv_summary <- numeric_summary %>%
mutate(cv = sd / abs(mean)) %>%
arrange(cv)
stable_vars <- cv_summary %>% filter(cv < 0.3) %>% pull(variable)
noisy_vars <- cv_summary %>% filter(cv > 0.5) %>% pull(variable)
cat("- **Most stable (CV < 0.3):**", paste(head(stable_vars, 5), collapse = ", "), "\n")
cat("- **Most variable (CV > 0.5):**", paste(head(noisy_vars, 5), collapse = ", "), "\n")
# Collinearity
cat("\n### Collinearity Patterns\n\n")
if (nrow(strong_cors) > 0) {
cat("- Found **", nrow(strong_cors), " variable pairs** with |r| > 0.7\n")
cat("- **Top correlations:**\n")
top_cors <- strong_cors %>% slice_head(n = 5)
for (i in 1:nrow(top_cors)) {
cat(" - ", top_cors$var1[i], " ↔ ", top_cors$var2[i],
" (r = ", top_cors$correlation[i], ")\n", sep = "")
}
} else {
cat("- No variable pairs with |r| > 0.7\n")
}
# Shot type differences
if (has_shot_type) {
cat("\n### Shot Type Observations\n\n")
cat("- Review violin/boxplots for systematic 2PT vs 3PT differences\n")
cat("- Jump height is typically higher for 3PT shots (greater distance)\n")
}
# Make rate
if (has_made) {
cat("\n### Make Rate Observations\n\n")
cat("- **Overall make rate:**", round(overall_make_rate * 100, 1), "%\n")
cat("- **Player variation:**", round(min(make_by_player$make_rate), 1), "% –",
round(max(make_by_player$make_rate), 1), "%\n")
}
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] C.UTF-8/C.UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
##
## time zone: America/Los_Angeles
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gt_1.3.0 skimr_2.2.2 naniar_1.1.0 patchwork_1.3.2
## [5] here_1.0.2 knitr_1.51 janitor_2.2.1 lubridate_1.9.5
## [9] forcats_1.0.1 stringr_1.6.0 dplyr_1.2.0 purrr_1.2.1
## [13] readr_2.1.6 tidyr_1.3.2 tibble_3.3.1 ggplot2_4.0.2
## [17] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.56 bslib_0.10.0 visdat_0.6.0
## [5] lattice_0.22-9 tzdb_0.5.0 vctrs_0.7.1 tools_4.5.1
## [9] generics_0.1.4 parallel_4.5.1 pkgconfig_2.0.3 Matrix_1.7-4
## [13] RColorBrewer_1.1-3 S7_0.2.1 lifecycle_1.0.5 compiler_4.5.1
## [17] farver_2.1.2 textshaping_1.0.4 repr_1.1.7 snakecase_0.11.1
## [21] htmltools_0.5.9 sass_0.4.10 yaml_2.3.12 pillar_1.11.1
## [25] crayon_1.5.3 jquerylib_0.1.4 cachem_1.1.0 nlme_3.1-168
## [29] tidyselect_1.2.1 digest_0.6.39 stringi_1.8.7 labeling_0.4.3
## [33] splines_4.5.1 rprojroot_2.1.1 fastmap_1.2.0 grid_4.5.1
## [37] cli_3.6.5 magrittr_2.0.4 base64enc_0.1-6 withr_3.0.2
## [41] scales_1.4.0 bit64_4.6.0-1 timechange_0.4.0 rmarkdown_2.30
## [45] bit_4.6.0 otel_0.2.0 ragg_1.5.0 hms_1.1.4
## [49] evaluate_1.0.5 mgcv_1.9-4 rlang_1.1.7 glue_1.8.0
## [53] xml2_1.5.2 vroom_1.7.0 jsonlite_2.0.0 R6_2.6.1
## [57] systemfonts_1.3.1 fs_1.6.6